#coding=utf-8
import urllib,re

# (declaration to look for, declaration to put in its place); the pair is
# unpacked straight into str.replace() when a page is re-encoded.
html_charset = (
  'charset=gb2312',
  'charset=utf-8',
)

# Regex for an <a ...> element whose onclick handler calls addBookmark();
# used with re.sub() to strip that link from the page.
re_bookmark = r'<a.*?onclick="addBookmark\(\);return false;".*?a>'

# Template for a channel page URL; %s is the page name plus its query string.
url = 'http://pindao.huoban.taobao.com/channel/%s'

# Regex template for a navigation link; %s is one of the patterns in html_nav.
# Expanded example: r'href="beauty.htm\?pid=mm_12837376_0_0.*?"'
re_h = r'href="%s.*?"'

# Navigation table: (regex matching the start of the original href,
# local path that replaces the whole href).
# The patterns are raw strings so the regex escape "\?" is spelled explicitly
# instead of relying on Python passing an unrecognised string escape through.
# NOTE(review): '/jewlry.html' is spelled without the second "e" -- kept as-is
# because it is a runtime path; confirm it matches the file actually served.
html_nav = (
  (r'channel_code.htm\?pid=mm_12837376_0_0', '/zhonghe.html'),
  (r'channel_mall.htm\?pid=mm_12837376_0_0', '/mall.html'),
  (r'lady.htm\?pid=mm_12837376_0_0', '/lady.html'),
  (r'beauty.htm\?pid=mm_12837376_0_0', '/beauty.html'),
  (r'jewelry.htm\?pid=mm_12837376_0_0', '/jewlry.html'),
  (r'man.htm\?pid=mm_12837376_0_0', '/man.html'),
  (r'baby.htm\?pid=mm_12837376_0_0', '/baby.html'),
  (r'digital.htm\?pid=mm_12837376_0_0', '/digital.html'),
  (r'brand_lib.htm\?pid=mm_12837376_0_0', '/brand.html'),
  (r'shop_street.htm\?pid=mm_12837376_0_0', '/shops.html'),
)

# One [remote URL, local file name] pair per navigation entry: the regex
# backslashes are dropped to get the literal URL, and the slash is dropped
# from the local path to get a bare file name.
download_urls = [
  [url % pattern.replace('\\', ''), local_path.replace('/', '')]
  for pattern, local_path in html_nav
]

# Names of the external stylesheets the rewritten pages link to.
kissy_css_file = 'kissy.css'
fandongdong_css_file = 'fandongdong.css'

# HTML fragment holding one <link> tag per stylesheet, each on its own line,
# with a newline before the first tag and after the last one.
css_html = '\n%s\n%s\n' % tuple(
  '<link rel="stylesheet" type="text/css" href="%s" />' % css_name
  for css_name in (kissy_css_file, fandongdong_css_file)
)

def get_pidndao_html(url):
  """Fetch *url* and return the raw response body.

  Args:
    url: Address to open; any scheme the standard URL opener supports.

  Returns:
    The body exactly as read from the response (undecoded bytes).
  """
  # Imported locally so the function works on both interpreter families:
  # Python 3 moved urlopen into urllib.request.
  try:
    from urllib.request import urlopen
  except ImportError:
    from urllib import urlopen

  response = urlopen(url)
  try:
    return response.read()
  finally:
    # Release the connection/file handle even if read() fails.
    response.close()

def css2file(html_content, css_file='pindao.css'):
  """Extract the inline stylesheet from *html_content* and save it to a file.

  The text between the first '<style type="text/css">' and the last
  '</style>' in the page is written to *css_file* (the match is greedy, so
  several style blocks are captured as one span, including whatever lies
  between them).

  Args:
    html_content: Page source as a string.
    css_file: Path of the file to write; overwritten if it exists.

  Raises:
    ValueError: If the page contains no '<style type="text/css">' block.
  """
  found = re.search(r'<style type="text/css">(.*)</style>', html_content, re.DOTALL)
  if found is None:
    raise ValueError('no <style type="text/css"> block found in html_content')
  # The context manager guarantees the data is flushed and the handle closed.
  with open(css_file, 'w') as out:
    out.write(found.group(1))
    
def pocess_pindao_html(html_content):
  """Rewrite a fetched channel page so it can be used as a local copy.

  The page is decoded from GB2312, then:
    1. the charset declaration is switched according to html_charset;
    2. the inline '<style type="text/css">' section is replaced by css_html;
    3. links matching re_bookmark are removed;
    4. every navigation href listed in html_nav is pointed at its local page.

  Args:
    html_content: Page source as GB2312-encoded bytes; undecodable bytes
      are dropped.

  Returns:
    The rewritten page encoded as UTF-8 bytes.
  """
  # Character encoding: decode once, work on text, encode once on the way
  # out. All patterns are ASCII, so the resulting bytes equal those obtained
  # by editing the UTF-8 bytes directly, and this also runs on Python 3.
  text = html_content.decode('gb2312', 'ignore')
  text = text.replace(*html_charset)

  # Swap the inline CSS for the external stylesheet links. The match is
  # greedy: it runs from the first opening tag to the last '</style>'.
  # A page without such a block is left untouched by this step.
  style = re.search(r'<style type="text/css">.*</style>', text, re.DOTALL)
  if style is not None:
    text = text[:style.start()] + css_html + text[style.end():]

  # Remove the "add bookmark" link.
  text = re.sub(re_bookmark, '', text)

  # Point the navigation links at the local pages.
  for pattern, local_path in html_nav:
    text = re.sub(re_h % pattern, 'href="%s"' % local_path, text)

  return text.encode('utf-8', 'ignore')




